cd "/disk/agedisk3/medicare.work/chandra-DUA52080/pragya-dua52080/replication/"
set seed 30248
capture log close
set more off
clear

** must be run on stata16mp
log using make_data.log, replace

global RISKADJUSTERS "ARSINDIC_* como_* card_*"
global RISKADJUSTERS_distance "ARSINDIC_* como_* card_* log_distance"
global RISKADJUSTERS_stemi "ARSINDIC_* como_* card_* STEMI"
global RISKADJUSTERS_interactions "ARSINDIC_* como_* card_*  bcomo_* bcard_* ocomo_* ocard_*"
global ARSRISKADJUSTERS "ARSINDIC_*"
global RISKADJUSTERS_racespecific "b_ASINDIC* o_ASINDIC* ASINDIC_* como_* card_* b o"

adopath ++ Programs

** This part of the code makes a file with all relevant data at the level of each index event 
global MAKEEVENTS = 1

** This part of the code creates a data at the level of provider - year with the various quality measures considered
global MAKELONG = 1


**************************************************************************************************
**************************************************************************************************
****************************************** CODE START ********************************************
**************************************************************************************************
**************************************************************************************************



if $MAKEEVENTS == 1 {

	** NOT PUBLIC AVAILABLE DATA: This is a dataset that is not publicly available
	* bring in index events
	use diag_id diag_year bene_zipclmindx_num medrawindx fileyear ageAtdiag surv_30 age_grp  male race  using /disk/agedisk3/medicare.work/chandra-DUA52080/pragya-dua52080/mlearn/source_shared/cohort-extraction/V07.00/master/1cohort_extraction/indx/ami100_indxdemo.dta, replace
	isid diag_id

		
	* zip of beneficiary
	rename bene_zipclmindx_num zip_bene
	destring zip_bene, replace
	rename diag_year year

	** merge bene's zip code to an HRR 
	** prefer the current year HRR but if zip is not there, use the last year hte zip was available
	* this is aggregation of public data here: https://data.dartmouthatlas.org/supplemental/

	rename zip_bene zip
	merge m:1 zip year using Exportable_RawData/zip2hsahrr.dta, keepusing(hrrnum) keep(master match) generate(match_bene_hrr)

	* merge to a zip code to the last year a zipcode was mentioned in zipcode-hrr map for zipcodes that for
	* some reason were not listed in that year's map

	merge m:1 zip using Exportable_RawData/zip2hsahrr_lastyear.dta, keepusing(hrrnum) keep(master match match_update match_conflict) generate(match_bene_hrr_lastyear) update	
	display "Observations for which bene couldn't be matched to HRR/HSA (dropping):"
	count if match_bene_hrr_lastyear==1
	drop if match_bene_hrr_lastyear==1
	drop match_bene_hrr match_bene_hrr_lastyear

	rename zip zip_bene
	rename hrrnum hrrnum_bene

	* generates alternate race & age measures *
		generate race_alt = 1 if race == "1"
		replace race_alt = 2 if race == "2"
		replace race_alt = 3 if race_alt == . 
		drop race
		
		gen byte w = race_alt == 1
		gen byte b = race_alt == 2
		gen byte o = race_alt == 3
		gen byte a = 1
		

	** NOT PUBLIC AVAILABLE DATA: This is a dataset that is not publicly available
	* bring in provider of index event, discharge date, and source of admission
	merge 1:1 diag_id medrawindx fileyear using /disk/agedisk3/medicare.work/chandra-DUA52080/pragya-dua52080/mlearn/source_shared/cohort-extraction/V07.00/master/1cohort_extraction/medpar/ami100_medparclms.dta, assert(match using) keepusing(prvnumgrp is_shorthosp is_cah admsndt  dgnscd1 prcdrcd1 prcdrcd2 prcdrcd3 prcdrcd4 prcdrcd5 prcdrcd6 prcdrdt1 prcdrdt2 prcdrdt3 prcdrdt4 prcdrdt5 prcdrdt6 dgnscd1 dgnscd2  dgnscd3  dgnscd4  dgnscd5  dgnscd6  dgnscd7 dgnscd8 dgnscd9  dgnscd10  dgnscd11  dgnscd12  dgnscd13  dgnscd14 dgnscd15 dgnscd16  dgnscd17  dgnscd18  dgnscd19  dgnscd20  dgnscd21 dgnscd22 dgnscd23 dgnscd24 dgnscd25 ) keep(match) nogenerate
	rename prvnumgrp pn
			
	** limit to stays at short term acute care hospitals or critical access hospitals
		keep if is_shorthosp | is_cah
		drop is_shorthosp is_cah
	
	** generates a variable for whether it is stemi or not 
	generate STEMI = 0
	foreach num of numlist 1/25{
		replace STEMI = 1 if substr(dgnscd`num', 1, 4) == "4107" 
	}
		
	** Generates variables for whether patient received cath/PCI
	** same day cath
	generate cath_pci = 0 
	** any day cath
	generate cath_pci_ad = 0 
	local cathlist "3721 3722 3723 885 8850 8851 8852 8853 8854 8855 8856 8857 8858 8859 0066 360 3600 3601 3602 3605 3606 3607 3609"			
	foreach code in `cathlist'{
	foreach num of numlist 1/6{
			replace cath_pci = 1 if prcdrcd`num' == "`code'" & admsndt == prcdrdt`num' 
			replace cath_pci_ad = 1 if prcdrcd`num' == "`code'" 
	}
	}
	
	** Generates variable for whether patient received  CABG (any day of admission)
	
	generate CABG = 0 
	local cabglist "3603 3610 3611 3612 3613 3614 3615 3616 3617 3618 3619 362 363 3631 3632 3633 3634 3635 3636 3637 3638 3639"			
	foreach code in `cabglist'{
	foreach num of numlist 1/6{
			replace CABG = 1 if prcdrcd`num' == "`code'"
	}
	}
	
	** generates a variable for CABG or cath_pci any day 				
	gen byte cardtech = cath_pci_ad == 1 |  CABG == 1
	
	* merge provider to POS to get indicator for non-US state
	** POS files are available publicly: https://www.nber.org/research/data/provider-services-files
	merge m:1 pn year using Exportable_RawData/pos.dta, keepusing(nonstate) keep(master match) generate(match_pos)

	* merge to POS for pn's that for some reason were not included in the POS in that year
	* merge to the last mention of the POS
	merge m:1 pn using Exportable_RawData/pos_lastyear.dta, keepusing(nonstate) keep(master match match_update match_conflict) generate(match_pos_lastyear) update

	display "Observations that did not match to provider of service file (dropping):"
	count if match_pos_lastyear==1
	drop if match_pos_lastyear==1
	drop match_pos match_pos_lastyear
		
	* get rid of patients treated outside the US
	display "Observations at hospitals outside US states:"
	count if nonstate==1
	drop if nonstate==1
	drop nonstate
	
	
	** NOT PUBLIC AVAILABLE DATA: This is a dataset that is not publicly available	
	* bring in zip, hrr, hsa of provider (pn_new)
		
	merge m:1 pn using "Intermediate_Output_Not_Exportable/pn2pn_new.dta", keep(master match) generate(match_dxwlk)
	display "Observations that did not match to dartmouth crosswalk to pn_new (dropping):"
	count if match_dxwlk==1
	drop if match_dxwlk==1
	drop match_dxwlk
			
	merge m:1 pn_new using "Intermediate_Output_Not_Exportable/pn_new.dta", assert(match using) keep(match) nogenerate keepusing(zip hrrnum hsanum)	
	rename zip zip_pn
	rename hrrnum hrrnum_pn
	rename hsanum hsanum_pn
		
	* switch provider identifier
	rename pn pn_orig
	rename pn_new pn
	
	gen byte samehrr = hrrnum_bene==hrrnum_pn
	
	** merges in MA penetration from CMS for 2014 for the elderly (age >65)
	** this is a stata version of publicly available data: https://data.cms.gov/summary-statistics-on-use-and-payments/medicare-geographic-comparisons/medicare-geographic-variation-by-hospital-referral-region/data
	
	** Merging beneficiary & provider HRR with MA penetration 
	rename hrrnum_bene hrrnum
	merge m:1 hrrnum using Exportable_RawData/hrr_ma_penetration_2014.dta, nogenerate assert(match using) keep(match)
	rename ma_ptcp ma_ptcp_bene
	rename hrrnum hrrnum_bene
	rename hrrnum_pn hrrnum
	merge m:1 hrrnum using Exportable_RawData/hrr_ma_penetration_2014.dta, nogenerate assert(match using) keep(match)
	rename ma_ptcp ma_ptcp_pn 
	rename hrrnum hrrnum_pn		

	** indicators for whether the person is above or below median HRR
	gen byte Ma = ma_ptcp_bene >= .29605 & ma_ptcp_pn >= .29605
	gen byte Mb = ma_ptcp_bene < .29605 & ma_ptcp_pn < .29605
	gen byte w_Mb = w & ma_ptcp_bene < .29605 & ma_ptcp_pn < .29605
	gen byte b_Mb = b & ma_ptcp_bene < .29605 & ma_ptcp_pn < .29605	
	
	** does geographic reweighting
	
	gen yidx = floor((year-1990)/5)
	local geo_levels "zip hrrnum"			
	foreach level in `geo_levels'{		
		** generates weights for white & black patients where white patients ** 
		** are weighted based on the distribution of black patients ** 
		sort `level'_bene yidx
		by `level'_bene yidx: egen `level'_pop_w = sum(w)
		by `level'_bene yidx: egen `level'_pop_b = sum(b)
		replace `level'_pop_w = . if `level'_pop_w == 0 		

		by `level'_bene yidx: egen `level'_pop_w_Mb = sum(w_Mb)
		by `level'_bene yidx: egen `level'_pop_b_Mb = sum(b_Mb)
		replace `level'_pop_w_Mb= . if `level'_pop_w_Mb == 0 
		
		generate `level'_weight_w = `level'_pop_b / `level'_pop_w if w
		replace `level'_weight_w = 0 if `level'_weight_w == . 
		gen byte `level'_weight_b = `level'_pop_w != . & b
		replace `level'_pop_w = 0 if `level'_pop_w == . 
	
		generate `level'_weight_w_Mb = `level'_pop_b_Mb / `level'_pop_w_Mb if w_Mb
		replace `level'_weight_w_Mb = 0 if `level'_weight_w_Mb == . 
		gen byte `level'_weight_b_Mb = `level'_pop_w_Mb != . & b_Mb
		replace `level'_pop_w_Mb = 0 if `level'_pop_w_Mb == .  

	}
	

	
	drop yidx	
							
	* bring in lat and lon coordinates of patient and provider
		
	foreach side in bene pn {
			
			rename zip_`side' zip
				
			* merge to lat/lon
	
			merge m:1 zip using Intermediate_Output_Not_Exportable/zip.dta, keep(master match) keepusing(lat lon src_sas) nogenerate
						
			rename zip zip_`side'						
			rename lon lon_`side'
			rename lat lat_`side'
	
			* convert to radians
			replace lon_`side' = lon_`side'*(_pi/180)
			replace lat_`side' = lat_`side'*(_pi/180)
			
	}
	generate distance = acos(sin(lat_bene)*sin(lat_pn) + cos(lat_bene)*cos(lat_pn)*cos(lon_pn-lon_bene) )*3958.756
	
	
	* bring in risk adjusters
	** NOT PUBLIC AVAILABLE DATA: This is a dataset that is not publicly available
	** Also dropping AMI since it should be almost 0 (is almost 0)
	merge 1:1 diag_id using /disk/agedisk3/medicare.work/chandra-DUA52080/pragya-dua52080/mlearn/source_shared/cohort-extraction/V07.00/master/2create_index_level_measures/last_yr_comorbid/ami100_medpar_cc.dta, assert(match using) keep(match) nogenerate keepusing(como_* card_hf card_ua card_cathero card_respFal card_hyperhd card_valve card_arrhythmia)

	** Groups certain comos together, otherwise run into collinearity issues
	generate card_hyperhd_hf = max(card_hyperhd, card_hf)
	rename card_hyperhd deprecate_card_hyperhd
	rename card_hf deprecate_card_hf
	
	gen yidx = floor((year-1990)/5)
	compress
	save Intermediate_Output_Not_Exportable/ami100.dta, replace

	
}


if $MAKELONG == 1 {

	use Intermediate_Output_Not_Exportable/ami100.dta, replace
	* create the "in sample" indicator & drops observations outside period of study
	gen byte insample_ami_a = yidx > 0 & yidx < 5
	keep if insample_ami_a == 1
			
	egen arsgroup = group(age_grp race_alt male)
		
	* make age/race/sex group indicators
	qui tab arsgroup , gen(ARSINDIC_)
	** drop an omitted category
	drop ARSINDIC_1
	** age-sex interactions for race-specific regression
	egen asgroup = group(age_grp male)
	qui tab asgroup , gen(ASINDIC_)

	foreach num of numlist 1/14{
		generate b_ASINDIC_`num' = b*ASINDIC_`num'
		generate o_ASINDIC_`num' =  o*ASINDIC_`num'
	}
	
	
	** Generates race-como interactions	
	local comorbidities como_hyper como_stroke como_cervas como_renal como_dialysis como_COPD como_pnuemo como_diabetes como_protein como_dementia como_FDLsDis como_periph como_metaCancer como_trauma como_subs como_mPsych como_cLiver card_valve card_ua card_cathero card_respFal card_hyperhd_hf card_arrhythmia
	foreach comorbidity of local comorbidities{
			generate b`comorbidity' = `comorbidity'*b
			generate o`comorbidity' = `comorbidity'*o
	}
	
	** sample for most measures of quality
	egen pnXyidx = group(pn yidx)
	egen npats_ami_a = sum(insample_ami_a), by(pnXyidx)
	** generates indicator for whether hospitals has more than 25 pateints in sample 
	replace insample_ami_a = (npats_ami_a >= 25) 
	
	** sample for analysis where we limit to white patients
	egen npats_ami_white = sum(w), by(pnXyidx)	
	gen byte insample_ami_white2 = insample_ami_a & w
	replace insample_ami_white2 = (npats_ami_white >= 1) & w

	** sample for analysis where we control for distance
	gen byte insample_ami_distance = distance != . & distance <= 100 
	egen npats_ami_distance = sum(insample_ami_distance), by(pnXyidx)
	replace insample_ami_distance = (npats_ami_distance >= 25) & distance != . & distance <= 100 

	** xtset at the provider level
	egen pn_group = group(pn)
	xtset pn_group

		* the measures are:
			* surv - risk-adjusted survival (standard fixed effect approach)
			* survre - risk-adjusted survival (random effects)
			* survi - risk-adjusted survival (fixed effects + comorbidity-race interactions)
			* sars - survival (age/race/sex adjusted)
			* snra - survival (no risk adjustment)
			* cath_pci - risk-adjusted cath-pci use same day
			* cardtech - risk-adjusted cath-pci-cabg use  any day
			* CABG - risk-adjusted CABG use any day
			* cath_pci_ad - risk-adjusted cath-pci use any day
			* survd - risk-adjusted survival (standard fixed effect approach) + distance
			* survrs - race specific 
			* survw2 - white only
			* stemi - control for stemi 
		
	local measures "survrs survw2 stemi surv survre survi sars snra cath_pci cardtech CABG cath_pci_ad survd "
	
	foreach meas in `measures'{
	
				** SET THE RISK ADJUSTERS FOR THE MEASURE
				
				* risk adjusters for the measures
				if ("`meas'"=="sars") {
					* age/race/sex only
					local riskadjusters "$ARSRISKADJUSTERS"
				}
				else if ("`meas'"=="snra") {
					* no risk adjusters
					* weird bug: xtreg with no rhs variables throws up an error
					* need to make this variable (which is then dropped by xtreg!)
					local riskadjusters "ones"
					gen byte ones = 1
				}
				else if ("`meas'" == "survi" ){
					local riskadjusters "$RISKADJUSTERS_interactions"
				}
				else if ("`meas'" == "survrs"){
					local riskadjusters "$RISKADJUSTERS_racespecific"
				}	
				else if ("`meas'" == "stemi"){
					local riskadjusters "$RISKADJUSTERS_stemi"
				}	
				else if ("`meas'" == "survd"){
					generate distance_sq = distance^2
					generate log_distance = ln(distance+1) if distance != . 
					local riskadjusters "$RISKADJUSTERS_distance"
				}			
				else {
					* full set of risk adjusters
					local riskadjusters "$RISKADJUSTERS"
				}


				** DEFINE THE OUTCOME
			
				if ("`meas'" == "survd" | "`meas'"=="surv"  | "`meas'" == "survw2" | "`meas'" == "snra" | "`meas'" == "sars" | "`meas'" == "survi" | "`meas'" == "survrs" | "`meas'" == "stemi") {
					local lhs = "surv_30"
				}
				else if ("`meas'" == "cath_pci" | "`meas'" == "pci_cabg" | "`meas'" == "cardtech" | "`meas'" == "CABG" | "`meas'" == "cath_pci_ad"){
					local lhs = "`meas'" 
				}
				
				** DEFINE WHETHER IT IS FIXED OR RANDOM EFFECT
				if ("`meas'" == "survrs" | "`meas'" == "survre"){
				local reg_type = "re"
				}
				else {
				local reg_type = "fe"
				}

				** DEFINE THE SAMPLE 
				local insamp = "insample_ami_a"	
				if "`meas'"	== "survd"{
					local insamp = "insample_ami_distance"
				}		
				if "`meas'" == "survw2"{
					local insamp = "insample_ami_white2"
				}

				** DEFINE PLACEHOLDER FOR THE ESTIMATE & SE 
				 
				if "`meas'" == "surv"{
					tempvar `reg_type' se ehat
					gen long long`meas'_ami_se_a = . 
				}
				if  "`meas'" != "survrs" {
					gen long`meas'_ami_fere_a = .
				}
				if "`meas'" == "survrs"{
					gen long`meas'w_ami_fere_a = .	
					gen long`meas'b_ami_fere_a = .	
				}
				
				
				** This file is saved and then is called upon in the race_decomp.do file and race_quality_covariance_simple.do for post estimation
				** We use this file for convenience because it has all the risk adjusters and same variable names as used in the model ** 
				
				if "`meas'" == "surv" | "`meas'" == "survrs" {
					save Intermediate_Output_Not_Exportable/pre_quality_calc_ami100_a_`meas'.dta, replace
				}
				
				foreach yidx_cur in 4 1 2 3{	
				if `yidx_cur' == 1 | `yidx_cur' == 4 | "`meas'" == "surv"{
				
					** EVERYTHING EXCEPT THE RACE SPECIFIC MODEL 
					
					if ("`meas'" != "survrs"){
					
					* run the regression!
					display "`meas'"
					xtreg `lhs' `riskadjusters' if `insamp' & yidx==`yidx_cur', `reg_type' vce(conventional) 
					* regression sample should equal the insample indicator
					return list
					matrix coefficients = r(table)
					assert e(sample) == (`insamp' & yidx==`yidx_cur')
					
					capture mkdir Exportable_Results/
					capture mkdir Exportable_Results/estimates/
					estimate save Exportable_Results/estimates/long`meas'_ami100_yidx`yidx_cur'_a.ster, replace 
			
					predict `reg_type' if `insamp' & yidx==`yidx_cur', u

					* Copy into the full FE variable
					* Note the regression was run with a constant term so we must
					* add it to the FE
					replace long`meas'_ami_fere_a= `reg_type' + _b[_cons] if `insamp' & yidx==`yidx_cur'
					drop `reg_type'					
					
						** now deals with standard errors
						if "`meas'" == "surv"  {
							* extract the error term fits, which speeds up fese
							predict `ehat' if `insamp' & yidx==`yidx_cur', e
							* since we have risk-adjusters  we can use fese
							display "Applying fese_adam"
							fese_adam `lhs' `riskadjusters' if `insamp' & yidx==`yidx_cur', ehat(`ehat') homo(`se')
							* copy into the full SE variable
							replace long`meas'_ami_se_a = `se' if `insamp' & yidx==`yidx_cur'
							drop `se' `ehat'
						}					

					}
					
					** RACE SPECIFIC MODEL 
					if ("`meas'" == "survrs"){
						* run the regression!
						display "`meas'"
						
						xtmixed `lhs' `riskadjusters' if `insamp' & yidx==`yidx_cur' || pn_group: b o, covariance(unstructured) reml
						capture mkdir Exportable_Results/
						capture mkdir Exportable_Results/estimates/
						estimate save Exportable_Results/estimates/long`meas'_ami100_yidx`yidx_cur'_a.ster, replace 
	
						predict re_b re_o re_w if `insamp' & yidx==`yidx_cur' , reffects
						
							** Adds the random effect and constant
							replace long`meas'w_ami_fere_a = _b[_cons] + re_w if `insamp' & yidx==`yidx_cur'
							
							** Generates the black measure
							** Pulls the white measure (black measure is only black specific terms + white terms)				
							replace long`meas'b_ami_fere_a = _b[_cons] + _b[b] + re_b + re_w  if `insamp' & yidx==`yidx_cur'										
							drop re_b re_o re_w	
														
					}

					
				}
				}
				

	
	}
	
	collapse (mean) *_ami_fere_* *_se_* (first) hsanum_pn hrrnum_pn lat_pn lon_pn zip_pn ma_ptcp_pn (sum) npatients_a_ami_tot_Mb = Mb npatients_w_ami_tot_Mb = w_Mb npatients_b_ami_tot_Mb = b_Mb npatients_a_ami_tot_act = a npatients_b_ami_tot_act = b npatients_w_ami_tot_act = w  npatients_b_ami_tot_wgt = zip_weight_b npatients_w_ami_tot_wgt = zip_weight_w  npatients_b_ami_tot_wgtH = hrrnum_weight_b npatients_w_ami_tot_wgtH = hrrnum_weight_w    npatients_w_ami_tot_wgtMb  = zip_weight_w_Mb npatients_b_ami_tot_wgtMb  = zip_weight_b_Mb  npatients_w_ami_tot_wgtHMb  = hrrnum_weight_w_Mb npatients_b_ami_tot_wgtHMb  = hrrnum_weight_b_Mb, by(pn yidx)						
	
	egen pn_period = group(pn yidx)	
	xtset pn_period


			local new_meas "surv"
			foreach meas in `new_meas' {
																		
				* perform univariate adjustment
				* only adjust within the analysis sample
				
				
				display "performing univariate EB adjustment for `longmeas' ami"
				capture noisily ebayes long`meas'_ami_fere_a long`meas'_ami_se_a ///
					if long`meas'_ami_fere_a!=. , ///
					 by(yidx) ///
					gen(long`meas'_ami_eb_a) var(long`meas'_ami_var_a) uvar(long`meas'_ami_uvar_a) simple
			
				if (_rc!=0) {
					* adjustment failed
					display "EB procedure failed to converge"
				}
				
			}
			
		
	save Intermediate_Output_Not_Exportable/bs0.dta, replace
	
	
}



log close

